//User-written commands to be installed
// distinct
// did_imputation
// ppmlhdfe
// estout (provides the esttab and eststo commands used below)


// Load the scientist panel.
// Start from an empty session, then read the CSV from the current
// working directory (c(pwd) is the directory Stata is already in, so the
// cd call re-selects that same directory).
clear all
cd "`c(pwd)'"
import delimited using "./scientist_panel.csv", clear

//Define treatment year and treatment indicator
// treatment_year is a copy of outcome_year; treatment_post equals treated
// in years at or after the treatment year and 0 in earlier years.
gen treatment_year=outcome_year
gen treatment_post=treated*(pub_year>=treatment_year)

// Event time: publication year measured relative to the treatment year.
gen year_rel_treatment=pub_year-treatment_year
// Treatment timing: the treatment year for treated scientists, set to
// missing for controls (treated==0). Used as the last positional argument
// of the did_imputation calls further down.
gen treatment_timing=treatment_year*treated
replace treatment_timing=. if treated==0
//generate scientist and incident identifier
// encode maps each distinct incident string to an integer code.
encode incident, gen(incident_num)
// scientist_id combines the incident code with the treated flag.
// It is stored as long rather than the default float: float holds integers
// exactly only up to 16,777,216, so with enough incidents distinct ids could
// be rounded to the same value, corrupting the fixed effects, clustering and
// by-groups that key on scientist_id.
gen long scientist_id=incident_num*100+treated

//define dependent variables
// Inverse hyperbolic sine transforms of the count outcomes. All three use the
// built-in asinh() so the transforms are computed the same way (the
// publication count previously spelled out ln(x+sqrt(1+x^2)) by hand, which
// is the same function).
gen arsinh_pub_count=asinh(pub_count)
gen arsinh_female_coauthors=asinh(num_female_coauthors)
// Male coauthors are the remainder of all coauthors after removing female ones.
gen num_male_coauthors=num_coauthors-num_female_coauthors
gen arsinh_male_coauthors=asinh(num_male_coauthors)
// Ratios: Stata returns missing where the denominator is zero.
gen share_female_coauthors=num_female_coauthors/num_coauthors
gen female_coauthors_per_paper=num_female_coauthors/pub_count
gen coauthors_per_paper=num_coauthors/pub_count

// Event-time indicators for the leads-and-lags specification.
// Each dummy equals treated for observations in the given year relative to
// the treatment year, and 0 otherwise.

// Lead bin: 10 or more years before treatment.
generate treatment_m10 = treated*(pub_year<=treatment_year-10)

// Single-year leads, 9 down to 1 year before treatment.
forvalues k = 9(-1)1 {
    generate treatment_m`k' = treated*(pub_year==treatment_year-`k')
}

// Treatment year (k = 0) and single-year lags 1 to 9.
forvalues k = 0/9 {
    generate treatment_`k' = treated*(pub_year==treatment_year+`k')
}

// Lag bin: 10 or more years after treatment, plus a variant that splits the
// bin into exactly year 10 (treatment_10_2) and 11 or more (treatment_11).
generate treatment_10   = treated*(pub_year>=treatment_year+10)
generate treatment_10_2 = treated*(pub_year==treatment_year+10)
generate treatment_11   = treated*(pub_year>=treatment_year+11)

// Experience: years elapsed since the author's first publication year.
generate experience_y = pub_year-author_first_pubyear


// Specify the leads-and-lags model: the regressor list contains every
// event-time dummy from m10 to 10 except treatment_m1, which is left out of
// the list.
global leads_and_lags "treatment_m10 treatment_m9 treatment_m8 treatment_m7 treatment_m6 treatment_m5 treatment_m4 treatment_m3 treatment_m2 treatment_0 treatment_1 treatment_2 treatment_3 treatment_4 treatment_5 treatment_6 treatment_7 treatment_8 treatment_9 treatment_10"

///////////////////////
// Publication output: leads and lags
//
// Works on a temporary copy of the data restricted to relative years
// -10..10; the full panel is restored at the end of the section.

preserve
keep if inrange(year_rel_treatment, -10, 10)

// Event-study version: 4 pre-trend coefficients and horizons 0 to 10.
// esttab is called with no stored estimates, so it exports the results
// left active by did_imputation.
quietly did_imputation arsinh_pub_count scientist_id pub_year treatment_timing, ///
    fe(scientist_id pub_year experience_y) autosample cluster(scientist_id) ///
    pretrends(4) horizons(0/10)
esttab using "did_leads_and_lags_pubs.csv", se replace
eststo clear

// Companion text file with sample descriptives: mean publication count,
// number of observations, distinct scientists and distinct incidents.
file open myfile using "did_leads_and_lags_pubs.txt", write replace
file write myfile "pub_mean,n_obs,d_scientists,d_incidents" _n
summarize pub_count, detail
file write myfile (r(mean)) ","
distinct scientist_id
file write myfile (r(N)) "," (r(ndistinct)) ","
distinct incident
file write myfile (r(ndistinct)) _n
file close myfile

// Pooled version: same specification without pretrends/horizons options.
quietly did_imputation arsinh_pub_count scientist_id pub_year treatment_timing, ///
    fe(scientist_id pub_year experience_y) autosample cluster(scientist_id)
esttab using "did_pubs_pre_post.csv", se replace
eststo clear

restore


///////////////////////
// Coauthors, female and male
//
// Same relative-year window (-10..10) as the publication-output section,
// applied to a temporary copy of the data.

preserve
keep if inrange(year_rel_treatment, -10, 10)

// One did_imputation model per coauthor gender, stored in the order
// female, male and exported together.
foreach sex in female male {
    eststo: quietly did_imputation arsinh_`sex'_coauthors scientist_id pub_year treatment_timing, ///
        fe(scientist_id pub_year experience_y) autosample cluster(scientist_id)
}
esttab using "did_coauthors.csv", se replace
eststo clear

// Descriptives file: one row per gender with the mean coauthor count,
// number of observations, distinct scientists and distinct incidents.
file open myfile using "did_coauthors.txt", write replace
file write myfile "spec,num_coauthors_mean,n_obs,d_scientists,d_incidents" _n
foreach sex in female male {
    summarize num_`sex'_coauthors, detail
    file write myfile "`sex'coauthors," (r(mean)) ","
    distinct scientist_id
    file write myfile (r(N)) "," (r(ndistinct)) ","
    distinct incident
    file write myfile (r(ndistinct)) _n
}
file close myfile

restore

////////
// Analysis on incidents that continue to publish
//
// Collapses the panel (relative years -10..9) to one row per scientist with
// pre- and post-treatment totals, keeps incidents where both the treated and
// the control scientist published before and after, and compares per-paper
// averages and totals across treated and control.

preserve
keep if inrange(year_rel_treatment, -10, 9)

// Split each measure into a pre-treatment and a post-treatment component.
// `sources' are the panel variables, `stems' the names of the split versions
// (pub_wcount is stored under the stem pub_count_qw).
local sources pub_count pub_wcount   num_coauthors num_female_coauthors num_male_coauthors
local stems   pub_count pub_count_qw num_coauthors num_female_coauthors num_male_coauthors
forvalues i = 1/5 {
    local src  : word `i' of `sources'
    local stem : word `i' of `stems'
    generate `stem'_prior = `src'*(pub_year<treatment_year)
    generate `stem'_after = `src'*(pub_year>=treatment_year)
}

// One row per scientist: totals of every split measure.
collapse (max) treated incident_num ///
    (sum) pub_count_prior pub_count_after pub_count_qw_prior pub_count_qw_after ///
    num_coauthors_prior num_coauthors_after ///
    num_female_coauthors_prior num_female_coauthors_after ///
    num_male_coauthors_prior num_male_coauthors_after, by(scientist_id)

// Flags: does the treated / control scientist have any publication in the
// pre / post period?
generate treated_avail_prior = (pub_count_prior>0)*treated
generate treated_avail_after = (pub_count_after>0)*treated
generate control_avail_prior = (pub_count_prior>0)*(treated==0)
generate control_avail_after = (pub_count_after>0)*(treated==0)

// Lift each flag to the incident level (eligible_1 .. eligible_4).
local j = 0
foreach flag in treated_avail_prior treated_avail_after control_avail_prior control_avail_after {
    local ++j
    egen eligible_`j' = max(`flag'), by(incident_num)
}

// Keep only incidents for which all four conditions hold.
generate eligible = eligible_1 & eligible_2 & eligible_3 & eligible_4
keep if eligible==1

// Change in number of coauthors (all, female, male), per paper.
// For each measure: the post-period average on treated controlling for the
// pre-period average, then the before/after difference on treated.
foreach stem in num_coauthors num_female_coauthors num_male_coauthors {
    generate avg_`stem'_prior = `stem'_prior/pub_count_prior
    generate avg_`stem'_after = `stem'_after/pub_count_after
    generate d_avg_`stem' = avg_`stem'_after-avg_`stem'_prior

    regress avg_`stem'_after treated avg_`stem'_prior
    regress d_avg_`stem' treated
}

// Publication quality: average of the weighted count per publication, and
// changes in raw and weighted publication totals.
generate avg_quality_prior = pub_count_qw_prior/pub_count_prior
generate avg_quality_after = pub_count_qw_after/pub_count_after
generate d_avg_quality = avg_quality_after-avg_quality_prior

generate d_pub_count = pub_count_after-pub_count_prior
generate d_pub_count_qw = pub_count_qw_after-pub_count_qw_prior

// Same pair of regressions for each outcome.
foreach y in avg_quality pub_count pub_count_qw {
    regress `y'_after treated `y'_prior
    regress d_`y' treated
}

restore


/////////
//Female Co-Authors
//
// Splits post-treatment female coauthors into "old" and "new" ones, collapses
// a window around treatment to one row per scientist, writes descriptive
// means to fem_coauthors.txt and regression results to fem_coauthors.csv.

// New female coauthors = all female coauthors minus the old ones.
// Generated before preserve, so it stays in the panel after restore.
gen num_new_female_coauthors=num_female_coauthors-num_old_female_coauthors

preserve
//focus on 5-year window around treatment
// NOTE(review): for integer years this keeps relative years -5..+3, i.e. five
// pre-treatment years but only four post-treatment years (0..3). Confirm
// the asymmetric window is intended.
keep if (treatment_year-pub_year<=5) & (treatment_year-pub_year>-4)

// Pre/post components of each measure (zero outside the respective period).
gen num_coauthors_prior=num_coauthors*(pub_year<treatment_year)
gen num_coauthors_after=num_coauthors*(pub_year>=treatment_year)

gen pub_count_prior=pub_count*(pub_year<treatment_year)
gen pub_count_after=pub_count*(pub_year>=treatment_year)

gen num_female_coauthors_prior=num_female_coauthors*(pub_year<treatment_year)
gen num_female_coauthors_after=num_female_coauthors*(pub_year>=treatment_year)

gen num_old_female_coauthors_after=num_old_female_coauthors*(pub_year>=treatment_year)

gen num_new_female_coauthors_after=num_new_female_coauthors*(pub_year>=treatment_year)

//collapse panel to cross section
collapse (max) treated incident_num (sum) pub_count_prior pub_count_after num_coauthors_prior num_coauthors_after num_female_coauthors_prior num_female_coauthors_after num_old_female_coauthors_after num_new_female_coauthors_after, by(scientist_id)

//per-paper analysis

//eligible cases must:
// - have non-zero publication before and after
// - for both, control and treated scientist
// NOTE(review): the eligible flag and the per-paper variables created below
// are not referenced again before restore; the descriptives and regressions
// that follow use the totals on the full collapsed sample. Confirm whether a
// "keep if eligible==1" was intended here.

gen treated_avail_prior=(pub_count_prior>0)*treated
gen treated_avail_after=(pub_count_after>0)*treated
gen control_avail_prior=(pub_count_prior>0)*(treated==0)
gen control_avail_after=(pub_count_after>0)*(treated==0)

egen eligible_1 = max(treated_avail_prior), by(incident_num)
egen eligible_2 = max(treated_avail_after), by(incident_num)
egen eligible_3 = max(control_avail_prior), by(incident_num)
egen eligible_4 = max(control_avail_after), by(incident_num)

gen eligible = eligible_1 & eligible_2 & eligible_3 & eligible_4

gen fem_pp_prior=num_female_coauthors_prior/pub_count_prior
gen fem_pp_after=num_female_coauthors_after/pub_count_after

gen old_fem_pp_after=num_old_female_coauthors_after/pub_count_after
gen new_fem_pp_after=num_new_female_coauthors_after/pub_count_after


// Descriptives: sample sizes, then group means of the totals
// (_c = treated==0, _t = treated==1).
// The header row is written without spaces after the commas so that every
// column name is clean, matching the other header rows this script writes.
file open myfile using "fem_coauthors.txt", write replace
file write myfile "n_obs,d_scientists,d_incidents,mean_prior_c,mean_after_c,mean_prior_t,mean_after_t,mean_after_new_c,mean_after_old_c,mean_after_new_t,mean_after_old_t"_n
distinct scientist_id
file write myfile (r(N)) ","
file write myfile (r(ndistinct)) ","
distinct incident_num
file write myfile (r(ndistinct)) ","
sum num_female_coauthors_prior if treated==0
file write myfile (r(mean)) ","
sum num_female_coauthors_after if treated==0
file write myfile (r(mean)) ","
sum num_female_coauthors_prior if treated==1
file write myfile (r(mean)) ","
sum num_female_coauthors_after if treated==1
file write myfile (r(mean)) ","
sum num_new_female_coauthors_after if treated==0
file write myfile (r(mean)) ","
sum num_old_female_coauthors_after if treated==0
file write myfile (r(mean)) ","
sum num_new_female_coauthors_after if treated==1
file write myfile (r(mean)) ","
sum num_old_female_coauthors_after if treated==1
file write myfile (r(mean)) _n
file close myfile

// Regressions on the treated indicator:
//  - change in asinh(total female coauthors) from before to after
//  - asinh of new and of old female coauthors after treatment
gen d_female_coauthors=asinh(num_female_coauthors_after)-asinh(num_female_coauthors_prior)
eststo: quietly reg d_female_coauthors treated

gen a_num_new_female_coauthors_after=asinh(num_new_female_coauthors_after)
gen a_num_old_female_coauthors_after=asinh(num_old_female_coauthors_after)

eststo: quietly reg a_num_new_female_coauthors_after treated
eststo: quietly reg a_num_old_female_coauthors_after treated
esttab using "fem_coauthors.csv", se replace
eststo clear

restore

// Old versus new female coauthors in levels, post-treatment only:
// keep the treatment year and the five years after it, total each count per
// scientist, and regress the totals on the treated indicator.
preserve
keep if inrange(pub_year-treatment_year, 0, 5)

collapse (max) treated ///
    (sum) num_old_coauthors num_old_female_coauthors num_coauthors num_female_coauthors, ///
    by(scientist_id)

// Rebuilt after the collapse, which drops every variable not listed above.
generate num_new_female_coauthors = num_female_coauthors-num_old_female_coauthors

foreach outcome in num_old_female_coauthors num_new_female_coauthors {
    regress `outcome' treated
}

restore
